Prediction of USD/PHP forex movements in the time of COVID-19. Despite the current situation, PHP is still a top performer in Asia, which leaves traders confused. Hence, this project tries to extract insights that might help traders make sense of the market. The dataset is from Investing.com, covering a 10-year timeframe from 2010 to 2020 (can be changed). The data is daily, since hourly data (which would be preferred) is not available.
cc: Arvin
Indicators are tools that help an investor or a trader to make a decision whether to buy or sell. Technical indicators (which can be called features in this context) constructed from stock data. In this part we will create following features: Bollinger Bands, RSI, MACD, Moving Average, Return, Momentum, Change and Volatility. These technical indicators will be used since they are easy to compute given our available data.
Return will serve as a target or dependent variable. Other features will serve as independent variables.
import functions
import investpy
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta
import plotting
import requests
import seaborn as sns
import statsmodels.api as sm
import warnings
from xgboost import XGBClassifier # for features importance
# ARIMA
from sklearn.metrics import mean_squared_error, confusion_matrix, f1_score, accuracy_score
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.arima_model import ARIMA
# Tensorflow 2.0 including Keras
import tensorflow.keras as keras
# Hyper Parameters Tuning with Bayesian Optimization (> pip install bayesian-optimization)
from bayes_opt import BayesianOptimization
from tensorflow.keras.layers import Input, Flatten, TimeDistributed, LSTM, Dense, Bidirectional, Dropout, ConvLSTM2D, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Convolution1D, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Sequential, Model
warnings.filterwarnings('ignore')
# Download 10 years of daily USD/PHP candles from Investing.com via investpy.
df = investpy.get_currency_cross_historical_data(currency_cross='USD/PHP',
                                                 from_date='01/08/2010',
                                                 to_date='01/08/2020')
# Lower-case the OHLC column names (pandas_ta expects 'open'/'high'/'low'/'close').
df.columns = ['open', 'high', 'low', 'close', 'Currency']
df.head()
df.describe()
# Report missing values, if any; otherwise confirm the data is complete.
print('No missing data') if sum(df.isna().sum()) == 0 else df.isna().sum()
# --- Technical indicator features (via pandas_ta) ---
# Indicator warm-up windows produce leading NaNs; they are filled with 0 so
# early rows are kept (note this injects artificial zeros into the first rows).
# Multi-column indicators (MACD, Bollinger Bands, Aroon) are computed ONCE and
# reused, instead of recomputing the whole indicator per column.

# Log Return Feature (serves as the prediction target)
df['Return'] = df.ta.log_return().fillna(0)
# Rate of Change Feature
df['Change'] = df.ta.roc().fillna(0)
# Relative Volatility Index Feature
df['Volatility'] = df.ta.rvi()['RVI_14_4'].fillna(0)
# Simple Moving Averages, 7 and 20 days
df['MA7'] = df.ta.sma(length=7).fillna(0)
df['MA20'] = df.ta.sma(length=20).fillna(0)
# Exponential Moving Averages, 7 and 20 days
df['EMA7'] = df.ta.ema(length=7).fillna(0)
df['EMA20'] = df.ta.ema(length=20).fillna(0)
# Momentum
df['Momentum'] = df.ta.mom().fillna(0)
# RSI (Relative Strength Index)
df['RSI'] = df.ta.rsi().fillna(0)
# MACD (Moving Average Convergence/Divergence) - computed once, two columns reused
macd = df.ta.macd()
df['MACD'] = macd['MACD_12_26_9'].fillna(0)
df['Signal'] = macd['MACDS_12_26_9'].fillna(0)
# Upper and Lower Bollinger Bands - computed once, two columns reused
bbands = df.ta.bbands()
df['Upper_band'] = bbands['BBU_5'].fillna(0)
df['Lower_band'] = bbands['BBL_5'].fillna(0)
# Parabolic SAR
df['PSAR'] = df.ta.psar()['PSARs_0.02_0.2'].fillna(0)
# Aroon Up and Down - computed once, two columns reused
aroon = df.ta.aroon()
df['Aroon_Up'] = aroon['AROONU_14'].fillna(0)
df['Aroon_Down'] = aroon['AROOND_14'].fillna(0)
df.dropna(inplace=True)
# Saving
df.to_csv('data/forex/USDPHPdaily.csv')
# Top-5 features most correlated with the Return target
df.corr()[['Return']].sort_values(by='Return', ascending=False)[:5]
Return is most strongly correlated with Change and RSI.
# Correlation heatmap of all engineered features.
plt.figure(figsize=(18,14))
sns.heatmap(df.corr(), annot=True, fmt='.2f')
# Flip the y-axis so the matrix reads top-to-bottom (17 features).
plt.ylim(17, 0)
plt.title('Correlation Between USD/PHP Features', fontSize=15)
plt.show()
# Indicator charts (project-local plotting helpers) for the last year of data.
plotting.bollinger_bands(df.loc['2019-8':'2020'])
plotting.rsi(df.loc['2019-8':'2020'])
plotting.macd(df.loc['2019-8':'2020'])
# Distribution of daily returns, one density curve per calendar month.
plt.figure(figsize=(14,5))
plt.style.use('seaborn-whitegrid')
for i in range(1,13):
    volatility = df[df.index.month==i].Return
    sns.distplot(volatility, hist=False, label=i)
plt.legend(frameon=True, loc=1, ncol=3, fontsize=10, borderpad=.6, title='Months')
# Vertical dashed line marks the overall mean return.
plt.axvline(df.Return.mean(), color='#666666', ls='--', lw=2)
# Shift tick positions so the mean line sits on a labeled tick.
plt.xticks(plt.xticks()[0] + df.Return.mean())
plt.title('USD/PHP Return by Month', fontSize=14)
plt.show()
plotting.forex_returns(df, value='Return', by='day', scatter=False)
plotting.forex_returns(df, value='close', by='month', scatter=False)
# Close price vs a 7-day EWM band of the high/low range for the last year.
plt.figure(figsize=(16,6))
s = df.loc['2019-8':'2020']
u = s.high.ewm(7).mean()
l = s.low.ewm(7).mean()
plt.fill_between(s.index, u, l, color='#af43af', alpha=0.1, label='High / Low')
plt.plot(s.close, color='#aa43af', label='Price')
plt.plot(s.close.ewm(7).mean(), color='#ff43af', label='Moving Average (7 Days)')
plt.legend(frameon=True, loc=1, borderpad=.6)
plt.title('USD/PHP Close and High-Low', fontSize=15)
plt.show()
Machine learning algorithms, including neural networks, rely heavily on probability in the learning process. Let's check the target variable Return for normality.
def zscore(x):
    """Standardize a series: subtract the mean, divide by the std deviation."""
    # Named function instead of an assigned lambda (PEP 8 E731).
    return (x - x.mean()) / x.std()

# Histogram of standardized returns - roughly bell-shaped if near-normal.
plt.hist(zscore(df.Return), bins=30)
plt.title('USD/PHP Return Distribution', fontSize=15)
plt.show()
plt.figure(figsize=(16,6))
# Q-Q plot against a standard normal; points on the line indicate normality.
sm.qqplot(df.Return, line='s', scale=1)
plt.rcParams['figure.figsize'] = [16.0, 6.0]
plt.title('Normality', fontSize=15)
plt.show()
Modeling part is all about trying different models, tweaking hyperparameters, evaluation, finding creative ways to engineer features and so on.
Baseline model would serve as a benchmark for comparing to more complex models.
def baseline_model(forex):
    '''
    Random (coin-flip) baseline classifier.

    Input: Series or Array of returns
    Returns: Accuracy Score (float in [0, 1])

    Generates random 0/1 predictions and compares them with the true
    up/down direction of the given returns. Serves as the benchmark that
    every real model must beat.
    '''
    # Previous docstring contained stray literal "\n\n" lines; removed.
    baseline_predictions = np.random.randint(0, 2, len(forex))
    accuracy = accuracy_score(functions.binary(forex), baseline_predictions)
    return accuracy
# Single draw of the random baseline.
baseline_accuracy = baseline_model(df.Return)
print('Baseline model accuracy: {:.1f}%'.format(baseline_accuracy * 100))
# Repeat the random baseline 1000 times to see the spread of its accuracy
# (list comprehension replaces the manual append loop).
base_preds = [baseline_model(df.Return) for _ in range(1000)]
plt.figure(figsize=(16,6))
plt.style.use('seaborn-whitegrid')
plt.hist(base_preds, bins=50, facecolor='#4ac2fb')
plt.title('Baseline Model Accuracy', fontSize=15)
# Dashed line marks the mean accuracy (should hover around 0.5).
plt.axvline(np.array(base_preds).mean(), c='k', ls='--', lw=2)
plt.show()
Baseline model on average has 49.9% accuracy. We take this number as a guideline for our more complex models.
AutoRegressive Integrated Moving Average (ARIMA) is a model that captures a suite of different standard temporal structures in time series data.
We will split train and test data to evaluate performance of ARIMA model.
# Dataset size, relevant for the ARIMA train/test split below (1900 train rows).
print('USD/PHP historical data contains {} entries'.format(df.shape[0]))
df[['Return']].head()
Let's take a look at the Autocorrelation Function below. The graph shows how time series data points correlate between each other. We should ignore first value in the graph that shows perfect correlation (value = 1), because it tells how data point is correlated to itself. What's important in this graph is how first data point is correlated to second, third and so on. We can see that it's so weak, it's close to zero. What does it mean to our analysis? It means that ARIMA is pretty much useless here, because it uses previous data points to predict following.
plt.rcParams['figure.figsize'] = (16, 3)
# Autocorrelation of daily returns over the first 300 lags; lag 0 is always 1
# and should be ignored when judging predictability.
plot_acf(df.Return, lags=range(300))
plt.show()
To make a conclusion we're going to try different orders and see how well they perform on a given data.
# ARIMA orders (p, d, q) to compare
orders = [(0,0,0),(1,0,0),(0,1,0),(0,0,1),(1,1,0)]
# Splitting into train and test sets
train = list(df['Return'][:1900].values)
test = list(df['Return'][1900:].values)
all_predictions = {}
for order in orders:
    try:
        # History will contain original train set,
        # but with each iteration we will add one datapoint
        # from the test set as we continue prediction (walk-forward scheme).
        history = train.copy()
        order_predictions = []
        for i in range(len(test)):
            model = ARIMA(history, order=order)     # defining ARIMA model
            model_fit = model.fit(disp=0)           # fitting model
            y_hat = model_fit.forecast()            # predicting 'return'
            order_predictions.append(y_hat[0][0])   # first element ([0][0]) is a prediction
            history.append(test[i])                 # add the following day's 'return' to the history
            print('Prediction: {} of {}'.format(i+1,len(test)), end='\r')
        # Directional accuracy: compare up/down of true vs predicted returns.
        accuracy = accuracy_score(
            functions.binary(test),
            functions.binary(order_predictions)
        )
        print(' ', end='\r')
        print('{} - {:.1f}% accuracy'.format(order, round(accuracy, 3)*100), end='\n')
        all_predictions[order] = order_predictions
    except Exception as err:
        # Was a bare 'except: pass' that swallowed the failure reason;
        # surface it so a crashing order can be told apart from an invalid one.
        print(order, '<== Wrong Order ({})'.format(err), end='\n')
# Big Plot: test-set returns vs ARIMA(1,1,0) walk-forward predictions.
fig = plt.figure(figsize=(16,4))
plt.plot(test, label='Test', color='#4ac2fb')
plt.plot(all_predictions[(1,1,0)], label='Predictions', color='#ff4e97')
plt.legend(frameon=True, loc=1, ncol=1, fontsize=10, borderpad=.6)
plt.title('Arima Predictions', fontSize=15)
plt.xlabel('Days', fontSize=13)
plt.ylabel('Returns', fontSize=13)
# Arrow pointing from the inset towards the highlighted region.
plt.annotate('',
             xy=(15, 0.05),
             xytext=(150, .1),
             fontsize=10,
             arrowprops={'width':0.4,'headwidth':7,'color':'#333333'}
             )
# Patch: dashed rectangle highlighting the first 30 days on the big plot.
ax = fig.add_subplot(1, 1, 1)
rect = patches.Rectangle((0,-.05), 30, .1, ls='--', lw=2, facecolor='y', edgecolor='k', alpha=.5)
ax.add_patch(rect)
# Small (inset) Plot: zoom on the first 30 days to show the one-step lag.
# NOTE(review): the inset plots order (0,1,0) while the big plot shows
# (1,1,0) - confirm whether this mismatch was intentional.
plt.axes([.25, 1, .2, .5])
plt.plot(test[:30], color='#4ac2fb')
plt.plot(all_predictions[(0,1,0)][:30], color='#ff4e97')
plt.tick_params(axis='both', labelbottom=False, labelleft=False)
plt.title('Lag')
plt.show()
# Distribution of true vs ARIMA(1,1,0)-predicted returns on the test window.
plt.figure(figsize=(16,5))
plt.hist(df[1900:].reset_index().Return, bins=20, label='True', facecolor='#4ac2fb')
plt.hist(all_predictions[(1,1,0)], bins=20, label='Predicted', facecolor='#ff4e97', alpha=.7)
plt.axvline(0, c='k', ls='--')
plt.title('ARIMA True vs Predicted Values Distribution', fontSize=15)
plt.legend(frameon=True, loc=1, ncol=1, fontsize=10, borderpad=.6)
plt.show()
# Binarize (up/down) the true test returns and the ARIMA predictions.
# Renamed 'train_binary' -> 'pred_binary': these are predictions, not
# training data, and the old name was misleading.
test_binary = functions.binary(df[1900:].reset_index().Return)
pred_binary = functions.binary(all_predictions[(1,1,0)])
tn, fp, fn, tp = confusion_matrix(test_binary, pred_binary).ravel()
accuracy = accuracy_score(test_binary, pred_binary)
print("True Positive and Negative: {}".format((tp + tn)))
print("False Positive and Negative: {}".format((fp + fn)))
print("Accuracy: {:.1f}%".format(accuracy*100))
XGBoost will be used here to extract important features that will be used for neural networks. This might help to improve model accuracy and speed up training. Training will be performed on the scaled USD/PHP dataset.
# Drop the constant currency label so only numeric features remain.
df.drop('Currency', axis=1, inplace=True)
scaled_usdphp = functions.scale(df, scale=(0,1))
# Features at day t predict the NEXT day's return (shift(-1)); the last row
# has no next-day target, hence [:-1].
X = scaled_usdphp[:-1]
# XGBClassifier needs class labels, not continuous returns: binarize the
# next-day return into up(1)/down(0), consistent with the rest of the notebook.
# (Fitting a classifier on raw floats would treat each value as its own class.)
y = functions.binary(df.Return.shift(-1)[:-1])
# Initializing and fitting a model
xgb = XGBClassifier()
xgb.fit(X, y)
important_features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': xgb.feature_importances_}) \
    .sort_values('Importance', ascending=True)
plt.figure(figsize=(16,8))
plt.style.use('seaborn-whitegrid')
plt.barh(important_features.Feature, important_features.Importance, color="#4ac2fb")
plt.title('XGboost - Feature Importance - USDPHP', fontSize=15)
plt.xlabel('Importance', fontSize=13)
plt.show()
# Window length: 21 trading days (~one calendar month) per sample.
n_steps = 21
scaled_usdphp = functions.scale(df, scale=(0,1))
# Build sliding-window sequences: features from days t-20..t predict the
# return at day t+1 (hence shift(-1) and dropping the last unlabeled row).
# 80% of the windows go to the train set, 20% to the test set.
X_train, \
y_train, \
X_test, \
y_test = functions.split_sequences(
    scaled_usdphp.to_numpy()[:-1],
    df.Return.shift(-1).to_numpy()[:-1],
    n_steps,
    split=True,
    ratio=0.8
)
# --- Model 1: plain LSTM regressor on 21-day windows ---
# Inline "NN%" comments record directional accuracies from past experiments.
keras.backend.clear_session()
n_steps = X_train.shape[1]
n_features = X_train.shape[2]
model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=False,
               input_shape=(n_steps, n_features))) # 50%
# model.add(Dropout(0.5)) # {20: 58%, 30:36%, 40:42%, 50:46%}
# model.add(LSTM(50, activation='relu', return_sequences=False)) # 42%
# model.add(Dense(100)) # 42%
model.add(Dense(50)) # 64%
# model.add(Dropout(0.5)) # {20: 42%, 30: 58%, 40: 48%%, 50: 58%}
# Single linear output: next-day return (regression, not classification).
model.add(Dense(1)) # 50%
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()
# batch_size = {8: 42%, 16: 58%, 32: 60%, 64: 58%}
model.fit(X_train, y_train, epochs=100, verbose=0,
          validation_data=(X_test, y_test), use_multiprocessing=True)
# Training curves: train vs validation loss per epoch.
plt.figure(figsize=(16,4))
plt.plot(model.history.history['loss'], label='Loss')
plt.plot(model.history.history['val_loss'], label='Val Loss')
plt.legend(loc=1)
plt.title('LSTM - Training Process')
plt.show()
# Project helper: computes MSE / directional accuracy and plots predictions.
pred, y_true, y_pred = functions.evaluation(
    X_test, y_test, model, random=False, n_preds=50,
    show_graph=True)
Network has low MSE which is good since it means that most of the predicted daily returns are near the true daily returns. For the accuracy of finding the underlying trend of the daily returns, the network exceeded the baseline which is good. However, 64% is still not enough for trading large amount of money.
# --- Model 2: 1-D CNN regressor on the same 21-day windows ---
keras.backend.clear_session()
n_steps = X_train.shape[1]
n_features = X_train.shape[2]
model = Sequential()
model.add(Conv1D(filters=20, kernel_size=2, activation='relu',
                 input_shape=(n_steps, n_features))) # 48%
model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.5)) # {20: 42%, 30: 58%, 40: 42%, 50: 58%}
# NOTE(review): input_shape on a non-first layer is ignored by Keras -
# harmless here, but it could be dropped.
model.add(Conv1D(filters=10, kernel_size=2, activation='relu',
                 input_shape=(n_steps, n_features))) # 60%
model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.5)) # {20: 58%, 30: 58%, 40: 58%, 50: 46%}
# model.add(Conv1D(filters=5, kernel_size=2, activation='relu',
#                  input_shape=(n_steps, n_features))) # 58%
# model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# model.add(Dense(50)) # 44%
model.add(Dense(1)) # 60%
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
model.summary()
# batch_size = {8: 50%, 16: 44%, 32: 60%, 64: 50%}
model.fit(X_train, y_train, epochs=25, verbose=0,
          validation_data=(X_test, y_test), use_multiprocessing=True)
# Training curves: train vs validation loss per epoch.
plt.figure(figsize=(16,4))
plt.plot(model.history.history['loss'], label='Loss')
plt.plot(model.history.history['val_loss'], label='Val Loss')
plt.legend(loc=1)
plt.title('Conv - Training Process')
plt.show()
pred, y_true, y_pred = functions.evaluation(
    X_test, y_test, model, random=False, n_preds=50,
    show_graph=True)
# --- Model 3: hybrid LSTM + Conv1D regressor ---
keras.backend.clear_session()
n_steps = X_train.shape[1]
n_features = X_train.shape[2]
model = Sequential()
# return_sequences=True keeps the time axis so Conv1D can slide over it.
model.add(LSTM(100, activation='relu', return_sequences=True,
               input_shape=(n_steps, n_features))) # 42%
# model.add(Dropout(0.5)) # {20: 42%, 30: 58%, 40: 58%, 50: 58%}
model.add(LSTM(50, activation='relu', return_sequences=True)) # 58%
# model.add(Dropout(0.5)) # {20: 42%, 30: 42%, 40: 58%, 50: 58%}
# model.add(LSTM(25, activation='relu', return_sequences=True)) # 54%
model.add(Conv1D(filters=20, kernel_size=2, activation='relu')) # 58%
model.add(MaxPooling1D(pool_size=2))
# model.add(Dropout(0.5)) # {20: 58%, 30: 58%, 40: 42%, 50: 42%}
# model.add(Conv1D(filters=10, kernel_size=2, activation='relu')) # 44%
# model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
# model.add(Dense(100)) # 42%
model.add(Dense(50)) # 60%
# model.add(Dropout(0.5)) # {20: 42%, 30: 42%, 40: 42%, 50: 42%}
model.add(Dense(1)) # 58%
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()
# batch_size = {8: 42%, 16: 58%, 32: 60%, 64: 42%}
model.fit(X_train, y_train, epochs=25, verbose=0,
          validation_data=(X_test, y_test), use_multiprocessing=True)
# Training curves: train vs validation loss per epoch.
plt.figure(figsize=(16,4))
plt.plot(model.history.history['loss'], label='Loss')
plt.plot(model.history.history['val_loss'], label='Val Loss')
plt.legend(loc=1)
plt.title('LSTM+Conv - Training Process')
plt.show()
pred, y_true, y_pred = functions.evaluation(
    X_test, y_test, model, random=False, n_preds=50,
    show_graph=True)
def create_model(u1, u2, d1, filters, pool, kernel):
    '''
    Build, briefly train (4 epochs) and score an LSTM+Conv1D network.

    Parameters arrive as continuous samples from BayesianOptimization,
    so each is truncated to an int before use. Reads X_train / y_train /
    X_test / y_test from the enclosing scope.

    Returns the NEGATIVE test MSE. BayesianOptimization MAXIMIZES its
    objective, so returning -MSE makes the search minimize the error;
    the previous version returned the raw MSE, which made the optimizer
    seek the WORST-performing model.
    '''
    keras.backend.clear_session()
    u1 = int(u1)
    u2 = int(u2)
    d1 = int(d1)
    filters = int(filters)
    kernel = int(kernel)
    pool = int(pool)
    n_steps = X_train.shape[1]
    n_features = X_train.shape[2]
    model = Sequential()
    model.add(LSTM(u1, activation='relu', return_sequences=True,
                   input_shape=(n_steps, n_features)))
    model.add(LSTM(u2, activation='relu', return_sequences=True))
    model.add(Conv1D(filters=filters, kernel_size=kernel, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool))
    model.add(Flatten())
    model.add(Dense(d1, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mse'])
    # Only 4 epochs: each optimization step must be cheap.
    model.fit(X_train, y_train, epochs=4, verbose=0,
              validation_data=(X_test, y_test), use_multiprocessing=True)
    score = model.evaluate(X_test, y_test, verbose=0)
    # score[1] is the test MSE; negate so "maximize" == "minimize MSE".
    return -score[1]
def bayesian_optimization():
    '''
    Tune the LSTM+Conv network hyper-parameters with Bayesian Optimization.

    Runs 10 evaluations of create_model (5 random init points + 5 guided
    iterations) over the bounds in 'pbounds', prints the best result and
    returns optimizer.max (a dict with 'target' and 'params') so callers
    can reuse the best hyper-parameters instead of copying them by hand.
    Previously the result was only printed and implicitly discarded.
    '''
    # NOTE: do not reorder this dict - bayes_opt consumes parameters in
    # key order, so reordering changes the random sampling sequence.
    pbounds = {
        'u1': (100, 250),
        'u2': (25, 62),
        'filters': (1, 20),
        'd1': (25, 62),
        'kernel': (2,10),
        'pool': (2, 10)
    }
    optimizer = BayesianOptimization(
        f = create_model,
        pbounds = pbounds,
        random_state = 1,
        verbose = 2
    )
    optimizer.maximize(init_points=5, n_iter=5)
    print(optimizer.max)
    return optimizer.max
# Rebuild the 21-day sliding-window datasets (same procedure as before)
# so the tuner trains on fresh, consistently scaled data.
n_steps = 21
scaled_usdphp = functions.scale(df, scale=(0,1))
X_train, \
y_train, \
X_test, \
y_test = functions.split_sequences(
    scaled_usdphp.to_numpy()[:-1],
    df.Return.shift(-1).to_numpy()[:-1],
    n_steps,
    split=True,
    ratio=0.8
)
# Launch the hyper-parameter search (10 model trainings; slow).
bayesian_optimization()
# --- Final model: LSTM+Conv with hyper-parameters hard-coded from a
# previous Bayesian-optimization run, retrained for the full 100 epochs ---
n_steps = X_train.shape[1]
n_features = X_train.shape[2]
model = Sequential()
model.add(LSTM(249, activation='relu', return_sequences=True,
               input_shape=(n_steps, n_features)))
model.add(LSTM(25, activation='relu', return_sequences=True))
model.add(Conv1D(filters=18, kernel_size=7, activation='relu'))
model.add(MaxPooling1D(pool_size=9))
model.add(Flatten())
model.add(Dense(61, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
model.fit(X_train, y_train, epochs=100, verbose=0,
          validation_data=(X_test, y_test), use_multiprocessing=True)
# Training curves: train vs validation loss per epoch.
plt.figure(figsize=(16,4))
plt.plot(model.history.history['loss'], label='Loss')
plt.plot(model.history.history['val_loss'], label='Val Loss')
plt.legend(loc=1)
plt.show()
# Evaluation on 100 randomly sampled test windows (project helper).
pred, y_true, y_pred = functions.evaluation(
    X_test, y_test, model, random=True, n_preds=100,
    show_graph=True)
Bayesian optimization didn't improve the MSE and accuracy of the LSTM+CNN model. This may be due to the choice of parameters. For further optimization, a wider range of parameters could be used to improve the performance of the model.